library(SummarizedExperiment)
library(dplyr)
library(ggplot2)
library(knitr)
library(DT)
library(limma)
library(EnhancedVolcano)
library(fgsea)
library(msigdbr)
library(gage)
library(pathview)
Load the Summarized Experiment data object, extract clinical information, log-transformed FPKM expression data, along with annotations. Prepare the gene expression data for analysis.
# Load your SE result and extract clinical data , expression data and annotation
# Load SE obj
se <- readRDS("~/BHK lab/Pancreas-cancer/output data/SE_CIA.rds")
# Extract Clinical data
clin <- data.frame(colData(se)) #dim is 50 x 16
# Extract the expression data
expr_fpkm <- assays(se)[["gene_expression"]] #dim 51627 x 50
# Extract the annotation
annot <- data.frame(rowData(se)) #dim is 51627 x 15
# Display first few rows of the dataset
DT::datatable(round(expr_fpkm[1:8, 1:7], 3))
Preparing expression data (RNA-seq) for analysis:
Total 17354 genes remain for downstream analyses.
# Step 1: Restrict to Protein-Coding Genes.
annot_proteincoding <- annot[annot$gene_type == "protein_coding",] # 19477 protein coding genes.
expr_fpkm<- expr_fpkm[rownames(expr_fpkm) %in% rownames(annot_proteincoding),]
dim(expr_fpkm)
## [1] 19477 50
# Step 2: Filter Low/Zero Expression Genes]
#data is log2(TPM+1)
r <- as.numeric(apply(expr_fpkm, 1, function(i) sum(round(i, 6) == round(log2(1), 6))))
# Get the indices of rows to remove
remove <- which(r > dim(expr_fpkm)[2] * 0.5)
# Remove rows from the matrix
expr_fpkm <- expr_fpkm[-remove, ] #dim is 17354 x 50
DT::datatable(round(expr_fpkm[1:8, 1:8], 3))
result: only one DE genes
# Limma approach applied.
design <- model.matrix(~ clin$low_high)
fit <- lmFit(expr_fpkm, design)
fit <- eBayes(fit)
# Sort by Pvalue.
top.table <- topTable(fit, sort.by = "P", n = Inf)
# To see How many DE gene are there.
length(which(top.table$adj.P.Val < 0.05)) # only one DE genes.
## [1] 1
# Preview
DT::datatable(data.frame(round(data.frame(top.table),3)))
# Save top table to .txt file.
top.table$Gene <- rownames(top.table)
write.table(top.table, file = "~/BHK lab/Pancreas-cancer/output data/CIA_Limma_Fit.txt", row.names = F, sep = "\t", quote = F)
# 1. Volcano plot.
# 1.1 Volcano Plot based on P value
EnhancedVolcano(top.table,
lab = rownames(top.table),
x = 'logFC',
y = 'P.Value',
pCutoff = 0.05,
FCcutoff = 1.5,
xlim = c(-5, 5),
ylim = c(0, -log10(10e-4)),
title= 'Volcano Plot based on P value'
)
# 1.2 Volcano Plot based on FDR
# adjusted p-values as red
EnhancedVolcano(top.table,
lab = rownames(top.table),
x = 'logFC',
y = 'adj.P.Val',
pCutoff = 0.05,
FCcutoff = 1.5,
xlim = c(-5, 5),
ylim = c(0, -log10(10e-4)),
legendLabels=c('Not sig.','Log FC','adj p-value',
'adj p-value & Log FC'),
title= 'Volcano Plot based on FDR',)
# Load human hallmark gene sets from MSigDB
pathwaysDF <- msigdbr("Homo sapiens", category="H")
# Create a list of gene symbols for each pathway
pathwaysH <- split(as.character(pathwaysDF$gene_symbol), pathwaysDF$gs_name)
# 1. Create ranks
# Create 'ranks' vector from 'top.table' with logFC as values and gene names as names.
ranks <- top.table$logFC
names(ranks) <- top.table$Gene
ranks <- sort(ranks, decreasing = T)
# Preview the data table of ranks.
DT::datatable(data.frame(ranks))
# 2. Bar Plot
# Display the ranked fold changes.
barplot(ranks)
# 3. Conduct analysis
# Run fgsea with 'pathwaysH' and 'ranks'.
fgseaRes <- fgsea(pathwaysH, ranks)
fgseaRes <- na.omit(fgseaRes)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES)) #order by NES
# Combine genes in leadingEdge with semicolons
fgseaResTidy$leadingEdge <- sapply(fgseaResTidy$leadingEdge, paste, collapse = ";")
fgseaResTidy <- fgseaResTidy[order(fgseaResTidy$padj), ]
# Show in a Tidy table.
head(fgseaResTidy)
## # A tibble: 6 × 8
## pathway pval padj log2err ES NES size leadingEdge
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr>
## 1 HALLMARK_HYPOXIA 3.96e-11 1.98e-9 0.851 -0.576 -2.13 188 ANGPTL4;PL…
## 2 HALLMARK_EPITHELIAL_M… 3.85e- 8 9.62e-7 0.720 -0.522 -1.94 195 PCOLCE2;AR…
## 3 HALLMARK_KRAS_SIGNALI… 1.64e- 6 2.04e-5 0.644 -0.494 -1.82 189 SLPI;ANGPT…
## 4 HALLMARK_GLYCOLYSIS 1.56e- 6 2.04e-5 0.644 -0.493 -1.83 195 ANGPTL4;GP…
## 5 HALLMARK_INFLAMMATORY… 1.23e- 5 1.23e-4 0.593 -0.467 -1.73 196 EREG;CCL17…
## 6 HALLMARK_APICAL_JUNCT… 2.25e- 5 1.88e-4 0.576 -0.469 -1.73 190 COL17A1;LA…
# Save fgseaRes as txt file.
write.table(fgseaResTidy, "~/BHK lab/Pancreas-cancer/output data/CIA_fgseaRes_HRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy$adjPvalue <- ifelse(fgseaResTidy$padj <= 0.05, "significant", "non-significant")
# Define colors for significance: grey for non-significant and red for significant results.
cols <- c("non-significant" = "grey", "significant" = "red")
# Using ggplot: reorder pathways by NES, fill bars based on significance, flip coordinates for readability.
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES, fill = adjPvalue)) + geom_col() +
scale_fill_manual(values = cols) + coord_flip() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "Pathway", y = "Normalized Enrichment Score", title = "Hallmark pathways NES
Enrichment Score from GSEA")
# 4. GSEA Results Table Plot.
# Plot GSEA table for significant pathways (padj < 0.05) using a GSEA parameter of 0.5.
plotGseaTable(pathwaysH[fgseaRes$pathway[fgseaRes$padj < 0.05]], ranks, fgseaRes, gseaParam=0.5)
# 5. Enrichment score plot.
# For significant pathways # 18 pathways.
# Store the list of significant pathways (padj < 0.05).
significant_pathways <- fgseaRes$pathway[fgseaRes$padj < 0.05]
# Use a for loop to iterate and plot enrichment for each significant pathway
for (i in significant_pathways) {
enrichment_plot <- plotEnrichment(pathwaysH[[i]], ranks)
enrichment_plot <- enrichment_plot + ggtitle(paste("Enrichment Plot for Pathway:",i))
print(enrichment_plot)
}
# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy[fgseaResTidy$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) +
scale_color_gradientn(colours = c("red","blue")) +
labs(x='Normalized Enrichment Score', y=NULL ) +
theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )
To do KEGG pathway analysis, both KEGG_MEDICUS and KEGG_LEGACY are downloaded. The cutoff of 0.05 is applied for visualization, while the text files include the results across all gene sets.
619 canonical pathways gene sets derived from the KEGG MEDICUS pathway database.
# Load KEGG_MEDICUS pathways.
pathwaysKEG <- gmtPathways("~/BHK lab/Pancreas-cancer/GSEA/c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt")
# Perform fgsea with KEGG pathways
fgseaRes_kegg <- fgsea(pathwaysKEG, ranks)
# Organize and sort KEGG results by NES
fgseaResTidy_kegg <- fgseaRes_kegg %>%
as_tibble() %>%
arrange(desc(NES))
# Combine genes in leadingEdge with semicolons
fgseaResTidy_kegg$leadingEdge <- sapply(fgseaResTidy_kegg$leadingEdge, paste, collapse = ";")
# Show in a Tidy table of keg.
head(fgseaResTidy_kegg)
## # A tibble: 6 × 8
## pathway pval padj log2err ES NES size leadingEdge
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr>
## 1 KEGG_MEDICUS_REFERENCE_… 3.98e-5 0.00352 0.557 0.962 1.96 6 AMY2A;AMY2…
## 2 KEGG_MEDICUS_REFERENCE_… 9.08e-3 0.268 0.381 0.793 1.83 10 GNMT;MAT1A…
## 3 KEGG_MEDICUS_REFERENCE_… 5.87e-3 0.204 0.407 0.586 1.76 30 MCM7;POLE;…
## 4 KEGG_MEDICUS_REFERENCE_… 1.78e-2 0.395 0.352 0.598 1.71 25 FGFR3;FGFR…
## 5 KEGG_MEDICUS_ENV_FACTOR… 4.32e-2 0.486 0.322 0.636 1.69 16 GSTM1;GSTA…
## 6 KEGG_MEDICUS_REFERENCE_… 2.33e-2 0.437 0.352 0.733 1.64 9 C6;CLU;C7;…
# Save fgseaRes_keg as txt file.
write.table(fgseaResTidy_kegg, "~/BHK lab/Pancreas-cancer/output data/CIA_fgseaRes_kegRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy_kegg$adjPvalue <- ifelse(fgseaResTidy_kegg$padj <= 0.05, "significant", "non-significant")
# Create a ggplot of significant pathways, with bars colored based on adjusted p-value significance.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(reorder(pathway, NES), NES, fill = adjPvalue)) +
geom_col() +
scale_fill_manual(values = cols) +
coord_flip() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "Pathway", y = "Normalized Enrichment Score", title = "KEGG pathways NES Enrichment Score from GSEA")
# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) +
scale_color_gradientn(colours = c("red","blue")) +
labs(x='Normalized Enrichment Score', y=NULL ) +
theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )
## KEGG_LEGACY
186 canonical pathways gene sets derived from the KEGG pathway database. These are considered Legacy gene sets since the introduction of the gene sets based on the more recent KEGG MEDICUS data.
# Load KEGG_MEDICUS pathways.
pathwaysKEG_LEG <- gmtPathways("~/BHK lab/Pancreas-cancer/GSEA/c2.cp.kegg_legacy.v2023.2.Hs.symbols.gmt")
# Perform fgsea with KEGG pathways
fgseaRes_kegg <- fgsea(pathwaysKEG, ranks)
# Organize and sort KEGG results by NES
fgseaResTidy_kegg <- fgseaRes_kegg %>%
as_tibble() %>%
arrange(desc(NES))
# Combine genes in leadingEdge with semicolons
fgseaResTidy_kegg$leadingEdge <- sapply(fgseaResTidy_kegg$leadingEdge, paste, collapse = ";")
# Show in a Tidy table of keg.
head(fgseaResTidy_kegg)
## # A tibble: 6 × 8
## pathway pval padj log2err ES NES size leadingEdge
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr>
## 1 KEGG_MEDICUS_REFERENCE_… 5.17e-5 0.00457 0.557 0.962 1.98 6 AMY2A;AMY2…
## 2 KEGG_MEDICUS_REFERENCE_… 6.44e-3 0.270 0.407 0.793 1.87 10 GNMT;MAT1A…
## 3 KEGG_MEDICUS_REFERENCE_… 1.14e-2 0.320 0.381 0.586 1.74 30 MCM7;POLE;…
## 4 KEGG_MEDICUS_REFERENCE_… 3.45e-2 0.551 0.322 0.598 1.71 25 FGFR3;FGFR…
## 5 KEGG_MEDICUS_REFERENCE_… 5.81e-2 0.551 0.322 0.733 1.68 9 C6;CLU;C7;…
## 6 KEGG_MEDICUS_ENV_FACTOR… 4.25e-2 0.551 0.322 0.636 1.67 16 GSTM1;GSTA…
# Save fgseaRes_keg as txt file.
write.table(fgseaResTidy_kegg, "~/BHK lab/Pancreas-cancer/output data/CIA_fgseaRes_keggLEGRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy_kegg$adjPvalue <- ifelse(fgseaResTidy_kegg$padj <= 0.05, "significant", "non-significant")
# Create a ggplot of significant pathways, with bars colored based on adjusted p-value significance.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(reorder(pathway, NES), NES, fill = adjPvalue)) +
geom_col() +
scale_fill_manual(values = cols) +
coord_flip() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "Pathway", y = "Normalized Enrichment Score", title = "KEGG (legacy) pathways NES Enrichment Score from GSEA")
# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) +
scale_color_gradientn(colours = c("red","blue")) +
labs(x='Normalized Enrichment Score', y=NULL ) +
theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )
To identifies whether certain gene sets are over-represented in a target gene group.
# Set a vector of differentially expressed genes for ORA
deGenes <- rownames(top.table)[top.table$adj.P.Val < 0.05] # 1 genes
# Defining the gene universe as all genes considered in the analysis
geneUniverse <- rownames(expr_fpkm)
## Performing ORA
# ORA for Hallmark pathways
oraHallmark <- fora(pathways = pathwaysH, genes = deGenes, universe = geneUniverse, minSize = 15, maxSize = 500)
oraHallmark$Pathway_Set <- "Hallmark"
oraHallmark$Significance <- ifelse(oraHallmark$padj < 0.05, "Significant", "Not Significant")
# ORA for KEGG_MEDICUS pathways
oraKeggMed <- fora(pathways = pathwaysKEG, genes = deGenes, universe = geneUniverse, minSize = 15, maxSize = 500)
oraKeggMed$Pathway_Set <- "KEGG_MEDICUS"
oraKeggMed$Significance <- ifelse(oraKeggMed$padj < 0.05, "Significant", "Not Significant")
# ORA for KEGG_LEGACY pathways
oraKeggLeg <- fora(pathways = pathwaysKEG_LEG, genes = deGenes, universe = geneUniverse, minSize = 15, maxSize = 500)
oraKeggLeg$Pathway_Set <- "KEGG_LEGACY"
oraKeggLeg$Significance <- ifelse(oraKeggLeg$padj < 0.05, "Significant", "Not Significant")
# Display the top results for each pathway set
# DT::datatable(head(oraHallmark,
# options = list(scrollX = TRUE, scrollY = TRUE, pageLength = 4, autoWidth = TRUE))
DT::datatable(oraHallmark)
DT::datatable(oraKeggMed)
DT::datatable(oraKeggLeg)
# oraHallmark
# Create an enrichment heatmap
ggplot(oraHallmark, aes(x = pathway, y = reorder(pathway, padj), fill = -log10(padj))) +
geom_tile() +
scale_fill_gradientn(colors = c("blue", "yellow", "red"), na.value = "grey50", trans = "reverse") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(fill = "-log10(p-adj)", x = "Pathway", y = "") +
ggtitle("Enrichment Heatmap of Top Pathways")
#ORA Kegg
# Create an enrichment heatmap
ggplot(oraKeggLeg[1:50, 1:8], aes(x = pathway, y = reorder(pathway, padj), fill = -log10(padj))) +
geom_tile() +
scale_fill_gradientn(colors = c("blue", "yellow", "red"), na.value = "grey50", trans = "reverse") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(fill = "-log10(p-adj)", x = "Pathway", y = "") +
ggtitle("Enrichment Heatmap of Top Pathways")
# Create an enrichment heatmap
ggplot(oraKeggMed[1:50, 1:8], aes(x = pathway, y = reorder(pathway, padj), fill = -log10(padj))) +
geom_tile() +
scale_fill_gradientn(colors = c("blue", "yellow", "red"), na.value = "grey50", trans = "reverse") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(fill = "-log10(p-adj)", x = "Pathway", y = "") +
ggtitle("Enrichment Heatmap of Top Pathways")